#############################################################################################
## Replication code for:                                                                   ##
## A Male Hostility Spiral? Polarized Communication among Political Elites on Social Media ##
## Step 2.                                                                                 ##
#############################################################################################

library(readxl)
library(tidyverse)
library(progress)
options(scipen=999)

# Load the account roster and the raw tweet export.
# NOTE(review): read.csv() parses long numeric ids as doubles -- confirm that
# tweet and user ids survive without precision loss before relying on them.
user_data <- read_excel("user_data.xlsx")
all_tweets <- read.csv("all_tweets.csv")

# Keep only rows whose retweet_id is missing, i.e. tweets that are not retweets.
all_tweets <- all_tweets[is.na(all_tweets[["retweet_id"]]), ]

# Keep the first occurrence of every tweet id.
all_tweets <- all_tweets[!duplicated(all_tweets[["id"]]), ]

# Attach the author's country (left join on user_id), then drop tweets whose
# author has no country_id in the roster.
all_tweets <- merge(
  all_tweets,
  user_data[c("user_id", "country_id", "country")],
  by = "user_id",
  all.x = TRUE
)
all_tweets <- all_tweets[!is.na(all_tweets[["country_id"]]), ]

# Collapse a run of leading @mentions down to the first one.
#
# When a tweet starts with several whitespace-separated @mentions, only the
# first is kept, since that is the account the reply is addressed to. Text
# that does not start with at least two consecutive @mentions is returned
# unchanged. Vectorised over `text`.
excess_tags <- function(text) {
  leading_mentions <- "^(@\\S+)(\\s+@\\S+)+"
  sub(pattern = leading_mentions, replacement = "\\1", x = text)
}

# Store the tweet text with surplus leading @mentions removed in a new column.
all_tweets[["text_fixed"]] <- excess_tags(all_tweets[["text"]])

# Build the direct (@mention) reference data set country by country and write
# it to a CSV file.
#
# Args:
#   tweets: data frame of tweets. This function reads country_id, text_fixed,
#     user_screen_name, created_at and id; direct_references() reads further
#     columns from it.
#   users: data frame of accounts. This function reads country_id and
#     user_screen_name; direct_references() reads further columns from it.
#   output_name: path of the CSV file to write.
#
# Side effects: prints every country id as it is processed and writes the
# combined result with write.csv() (row names included).
create_data1 <- function(tweets, users, output_name = "data1.csv") {
  # Missing country ids are skipped: `x == NA` is NA for every row, which would
  # select all-NA rows rather than an empty subset.
  country_ids <- unique(users$country_id)
  country_ids <- country_ids[!is.na(country_ids)]

  # Collect per-country results in a list and bind once at the end instead of
  # re-copying the accumulated data frame on every iteration.
  pieces <- list()
  for (country_id in country_ids) {
    print(country_id)

    # which() also drops rows whose own country_id is NA.
    tweets_cntry <- tweets[which(tweets$country_id == country_id), ]
    users_cntry <- users[which(users$country_id == country_id), ]

    temp <- direct_references(tweets_cntry$text_fixed,
                              tweets_cntry$user_screen_name,
                              unique(users_cntry$user_screen_name),
                              tweets_cntry$created_at,
                              tweets_cntry,
                              users,
                              tweets_cntry$id)

    # A country without any reference contributes no rows; assigning a scalar
    # column to a zero-row data frame would be an error.
    if (nrow(temp) > 0) {
      temp$country_id <- country_id
      pieces[[length(pieces) + 1L]] <- temp
    }
  }

  data <- if (length(pieces) > 0) do.call(rbind, pieces) else data.frame()
  write.csv(data, output_name)
}

# Build the edge list of direct (@mention) references.
#
# For every non-missing screen name in `references`, each tweet whose text
# contains "@<screen name>" (case-insensitive) produces one row linking the
# tweet's author to the mentioned account. Account attributes are then joined
# in from `users` for both ends of the edge.
#
# Args:
#   text: character vector of tweet texts.
#   Sender: screen name of each tweet's author (parallel to `text`).
#   references: screen names to look for; NA entries are skipped.
#   date: creation time of each tweet (parallel to `text`).
#   tweets: data frame parallel to `text`; favorite_count and retweet_count
#     are read from it.
#   users: data frame with user_screen_name, user_id, party, full_name and
#     gender.
#   id: tweet ids (parallel to `text`).
#
# Returns: a data frame with columns sender, Receiver, text, date, id, likes,
#   shares, Receiver_id, Sender_id, receiver_party, receiver_name,
#   receiver_gender, sender_party, sender_name, sender_gender. It has zero
#   rows when nothing matches. Rows whose receiver or sender has no entry in
#   `users` are dropped by the final two (inner) joins.
direct_references <- function(text, Sender, references, date, tweets, users, id) {
  pb <- progress_bar$new(
    format = "  [:bar] :percent :current/:total (:eta)",
    total = length(references), clear = FALSE, width = 60
  )

  # For each reference, the positions of the tweets that mention it.
  hits <- lapply(references, function(handle) {
    pb$tick()
    if (is.na(handle)) {
      return(integer(0))
    }
    # The handle must end where the mention ends; otherwise "@ann" would also
    # match "@annie" and the tweet would be attributed to the wrong account.
    pattern <- paste0("@", handle, "([^A-Za-z0-9_]|$)")
    which(grepl(pattern, text, ignore.case = TRUE))
  })
  hit_rows <- as.integer(unlist(hits))

  # Assemble all edges in one go. With no hits this is a zero-row frame that
  # still carries the columns the joins below need.
  network_data <- data.frame(text = text[hit_rows])
  network_data$Receiver <- rep(references, times = lengths(hits))
  network_data$sender <- Sender[hit_rows]
  network_data$date <- date[hit_rows]
  network_data$id <- id[hit_rows]
  network_data$likes <- tweets$favorite_count[hit_rows]
  network_data$shares <- tweets$retweet_count[hit_rows]

  # Rename by column name rather than by hard-coded position, so a missing
  # input column cannot silently shift the labels.
  rename_cols <- function(df, old, new) {
    names(df)[match(old, names(df))] <- new
    df
  }
  ids <- users[c("user_screen_name", "user_id")]
  profile <- users[c("party", "full_name", "user_screen_name", "gender")]

  network_data <- merge(network_data, ids,
                        by.x = "Receiver", by.y = "user_screen_name",
                        all.x = TRUE)
  network_data <- rename_cols(network_data, "user_id", "Receiver_id")

  network_data <- merge(network_data, ids,
                        by.x = "sender", by.y = "user_screen_name",
                        all.x = TRUE)
  network_data <- rename_cols(network_data, "user_id", "Sender_id")

  network_data <- merge(network_data, profile,
                        by.x = "Receiver", by.y = "user_screen_name")
  network_data <- rename_cols(
    network_data,
    c("party", "full_name", "gender"),
    c("receiver_party", "receiver_name", "receiver_gender")
  )

  network_data <- merge(network_data, profile,
                        by.x = "sender", by.y = "user_screen_name")
  network_data <- rename_cols(
    network_data,
    c("party", "full_name", "gender"),
    c("sender_party", "sender_name", "sender_gender")
  )

  network_data
}

# Build and save the direct-reference data set.
create_data1(tweets = all_tweets, users = user_data, output_name = "data1.csv")

# Build the edge list of indirect (full-name) references.
#
# For every non-missing name in `references`, each tweet whose text contains
# that name (case-insensitive, literal match) produces one row linking the
# tweet's author to the named account. Account attributes are then joined in
# from `users` for both ends of the edge.
#
# Args:
#   text: character vector of tweet texts.
#   references: full names to look for; NA entries are skipped. Entry i must
#     describe row i of `users_cntry`.
#   tweets: data frame parallel to `text`; user_id, created_at, id and (if
#     present) full_name are read from it.
#   users: data frame with user_id, user_screen_name, party, gender and
#     full_name.
#   users_cntry: data frame parallel to `references`; user_id and full_name
#     are read from it.
#
# Returns: a data frame of references with zero rows when nothing matches.
#   The column name "Reciver_username" is misspelled but is kept as-is because
#   it is part of the written output.
indirect_references <- function(text, references, tweets, users, users_cntry) {
  pb <- progress_bar$new(
    format = "  [:bar] :percent :current/:total (:eta)",
    total = length(references), clear = FALSE, width = 60
  )

  # Names are data, not patterns: escape regex metacharacters so that e.g.
  # "A. Smith" does not match "Al Smith" and a stray "(" cannot raise an error.
  escape_regex <- function(x) {
    gsub("([\\\\.|()\\[\\]{}^$*+?])", "\\\\\\1", x, perl = TRUE)
  }

  # For each reference, the positions of the tweets that contain the name.
  hits <- lapply(references, function(full_name) {
    pb$tick()
    if (is.na(full_name)) {
      return(integer(0))
    }
    which(grepl(escape_regex(full_name), text, ignore.case = TRUE))
  })
  hit_rows <- as.integer(unlist(hits))
  n_hits <- lengths(hits)
  ref_rows <- seq_along(references)

  # Assemble all edges in one go. With no hits this is a zero-row frame that
  # still carries the columns the joins below need.
  network_data <- data.frame(text = text[hit_rows])
  network_data$Receiver_id <- rep(users_cntry$user_id[ref_rows], times = n_hits)
  network_data$Receiver_full_name <- rep(users_cntry$full_name[ref_rows],
                                         times = n_hits)
  network_data$Sender_id <- tweets$user_id[hit_rows]
  network_data$Sender_full_name <- tweets$full_name[hit_rows]
  network_data$date <- tweets$created_at[hit_rows]
  network_data$id <- tweets$id[hit_rows]

  # merge() appends the joined columns at the end, so each rename targets the
  # trailing column(s).
  screen_names <- users[c("user_screen_name", "user_id")]

  network_data <- merge(network_data, screen_names,
                        by.x = "Receiver_id", by.y = "user_id", all.x = TRUE)
  colnames(network_data)[ncol(network_data)] <- "Reciver_username"

  network_data <- merge(network_data, screen_names,
                        by.x = "Sender_id", by.y = "user_id", all.x = TRUE)
  colnames(network_data)[ncol(network_data)] <- "Sender_username"

  network_data <- merge(network_data, users[c("party", "user_id", "gender")],
                        by.x = "Receiver_id", by.y = "user_id")
  last <- ncol(network_data)
  colnames(network_data)[(last - 1):last] <- c("receiver_party",
                                               "receiver_gender")

  network_data <- merge(network_data,
                        users[c("party", "user_id", "gender", "full_name")],
                        by.x = "Sender_id", by.y = "user_id")
  last <- ncol(network_data)
  colnames(network_data)[(last - 2):last] <- c("sender_party", "sender_gender",
                                               "Sender_full_name")

  network_data
}

# Build the indirect (full-name) reference data set country by country and
# write it to a CSV file.
#
# Args:
#   tweets: data frame of tweets. This function reads country_id and
#     text_fixed; indirect_references() reads further columns from it.
#   users: data frame of accounts. This function reads country_id and
#     full_name; indirect_references() reads further columns from it.
#   output_name: path of the CSV file to write.
#
# Side effects: prints every country id as it is processed and writes the
# combined result with write.csv() (row names included).
create_data2 <- function(tweets, users, output_name = "data2.csv") {
  # Missing country ids are skipped: `x == NA` is NA for every row, which would
  # select all-NA rows rather than an empty subset.
  country_ids <- unique(users$country_id)
  country_ids <- country_ids[!is.na(country_ids)]

  # Collect per-country results in a list and bind once at the end instead of
  # re-copying the accumulated data frame on every iteration.
  pieces <- list()
  for (country_id in country_ids) {
    print(country_id)

    # which() also drops rows whose own country_id is NA.
    tweets_cntry <- tweets[which(tweets$country_id == country_id), ]
    users_cntry <- users[which(users$country_id == country_id), ]

    # A full name shared by several accounts in the same country cannot be
    # attributed to one of them, so only names that occur once are searched.
    users_cntry <- users_cntry %>%
      group_by(full_name) %>%
      filter(n() == 1) %>%
      ungroup()

    temp <- indirect_references(tweets_cntry$text_fixed,
                                users_cntry$full_name,
                                tweets_cntry,
                                users,
                                users_cntry)

    # A country without any reference contributes no rows; assigning a scalar
    # column to a zero-row data frame would be an error.
    if (nrow(temp) > 0) {
      temp$country_id <- country_id
      pieces[[length(pieces) + 1L]] <- temp
    }
  }

  data <- if (length(pieces) > 0) do.call(rbind, pieces) else data.frame()
  write.csv(data, output_name)
}
# Build and save the indirect-reference data set.
create_data2(tweets = all_tweets, users = user_data, output_name = "data2.csv")